Introduction

The purpose of this workflow is to look at the genes that have differential states across strains at the TSS.

Here we look at which states are represented at the TSS, and which genes have variation there.

We look for genes for which all strains have a single state at the TSS, and genes for which two states are represented across the strains.

We count the number of genes with each state combination at the TSS. We calculate functional enrichments for each group of genes. And we look at expression patterns for each group of genes.

library(here);library(gprofiler2);library(pheatmap);library(fgsea)
num.states = 9
is.interactive = FALSE
#is.interactive = TRUE
form.states <- paste0(num.states, "_states_C")
data.dir <- here("Data", "ChromHMM", form.states)
results.dir <- here("Results", "ChromHMM", form.states)
all.code.dir <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.code.dir)){
    all.fun <- list.files(all.code.dir[i], full.names = TRUE)
    for(j in 1:length(all.fun)){source(all.fun[j])}
}

Load data

transcript.info <- readRDS(here("Data", "RNASeq", "RNASeq_gene_info.RData"))
chrom.mats <- readRDS(file.path(results.dir, "Chromatin.States.Gene.Coords.RDS"))
#chrom.mats <- readRDS(file.path(results.dir, "Chromatin_States_9_full_gene_1000.RData"))
transformed.data.file <- here("Data", "RNASeq", "StrainsEffCts9_vst.RDS")
#expr <- readRDS(transformed.data.file)
expr <- readRDS(here("Data","RNASeq", "Strain_Mean_C_Expression.RData"))

transcript.info.file <- here("Data", "RNASeq", "RNASeq_gene_info.RData")
transcript.info <- readRDS(transcript.info.file)
col.table <- as.matrix(read.table(here("Data", "support_files", "strain.color.table.txt"), 
    sep = "\t", comment.char = "%", stringsAsFactors = FALSE))

Find the states at the TSS of each transcript

get_tss_states <- function(chrom.mat){
    if(length(chrom.mat) == 1){return(rep(NA, 9))}
    tss.idx <- get.nearest.pt(as.numeric(rownames(chrom.mat)), 0)
    return(chrom.mat[tss.idx,,drop=FALSE])
}

tss.states <- t(sapply(chrom.mats, get_tss_states))
colnames(tss.states) <- colnames(chrom.mats[[1]])

#tally which states are present there
state.counts <- t(apply(tss.states, 1, 
    function(y) sapply(1:num.states, function(x) length(which(y == x)))))
colnames(state.counts) <- 1:num.states

#find all the genes with variation
var.genes <- which(apply(state.counts, 1, function(x) length(which(x != 0)) > 1))

Enrichment of state genes across strains

#this code looks at functional enrichment for genes marked with each
#state across the strains
state.enrich.file <- file.path(results.dir, "Enrichment.by.State.and.Strain.RDS")
if(!file.exists(state.enrich.file)){
st.enrich <- vector(mode = "list", length = ncol(tss.states))
names(st.enrich) <- 1:num.states
    for(st in 1:num.states){
        if(is.interactive){print(st)}
        st.genes <- apply(tss.states, 2, function(x) which(x == st))
        st.enrich[[st]] <- lapply(st.genes, function(x) gost(names(x), organism = "mmusculus",
        sources = c("GO", "REACTOME", "KEGG")))
    }
    saveRDS(st.enrich, state.enrich.file)
}else{
    st.enrich <- readRDS(state.enrich.file)
    for(st in 1:num.states){
        cat("### State", st, "\n")
        plot.enrichment.group(st.enrich[[st]], max.term.size = 2000,
        sort.by = "p_value")
        cat("\n\n")
    }
}

State 1

State 2

State 3

State 4

State 5

State 6

State 7

State 8

State 9

State counts

There are 4210 out of 14048 total genes with variation at the TSS. The figure below shows genes in rows and states in columns. Each cell contains the number of strains that had each state at the TSS for that gene. State 7 is the most abundant state at the TSS even for genes with variation at this position. State 1 is the next most abundant.

pheatmap(state.counts, show_rownames = FALSE)

The figure below shows which states tend to co-occur at the TSS. Most of the states are negatively correlated with each other. State 7 tends to co-occur with state 8, but is very negatively correlated with state 1.

States 1, 2, and 3, all negative states, tend to co-occur, and states 4 and 5 tend to co-occur.

cor.mat <- state.counts
cor.mat[which(state.counts > 0)] <- 1
state_pairs <- pair.matrix(1:num.states, self.pairs = TRUE)
pair.names <- apply(state_pairs, 1, function(x) paste(x, collapse = "_"))

all.pair.co.occur <- vector(mode = "list", length = nrow(state_pairs))
names(all.pair.co.occur) <- pair.names
pair.cor <- matrix(NA, nrow = num.states, ncol = num.states)
rownames(pair.cor) <- colnames(pair.cor) <- 1:num.states
paired.tss.expr <- vector(mode = "list", length = nrow(state_pairs))
names(paired.tss.expr) <- pair.names
for(i in 1:nrow(state_pairs)){
    state1 <- state_pairs[i,1]
    state2 <- state_pairs[i,2]
    
    if(state1 == state2){
        state1.locale <- state2.locale <- which(state.counts[,state1] == ncol(state.counts))
    }else{
        state1.locale <- which(cor.mat[,state1] > 0)
        state2.locale <- which(cor.mat[,state2] > 0)
    }
    both.together <- intersect(state1.locale, state2.locale)
    group.genes <- rownames(cor.mat)[both.together]
    all.pair.co.occur[[i]] <- group.genes
    
    #get expression for these genes
    expr.locale <- match(group.genes, names(expr))
    has.expr <- which(sapply(expr[expr.locale], length) == 9)
    group.expr <- Reduce("rbind", expr[expr.locale[has.expr]])
    rownames(group.expr) <- group.genes[has.expr]
    #divide expression based on TSS state
    group.tss <- tss.states[c(match(rownames(group.expr), rownames(tss.states))),]
    
    #plot(as.vector(group.expr)~as.factor(as.vector(group.tss)), xlab = "State", ylab = "Expression", main = paste("State", state1, "and State", state2))

    paired.tss.expr[[i]] <- list("TSS_states" = group.tss, "Group_Expr" = group.expr)
        
    #calculate correlation of two states
    state1.freq <- length(state1.locale)/nrow(cor.mat)
    state2.freq <- length(state2.locale)/nrow(cor.mat)
    if(state1 != state2){
        both.freq <- length(both.together)/nrow(cor.mat)
        cof.rate <- both.freq - (state1.freq*state2.freq)
        pair.cor[state1, state2] <- cof.rate
        pair.cor[state2, state1] <- cof.rate
    }else{
        pair.cor[state1, state2] <- state1.freq
    }
}

if(is.interactive){quartz()}
#with individual state frequencies
imageWithText(signif(pair.cor,2), split.at.vals = TRUE, 
    col.scale = c("blue", "brown"), grad.dir = "ends")
## Loading required package: grid

#Without individual frequencies
diag(pair.cor) <- NA
imageWithText(signif(pair.cor,2), split.at.vals = TRUE, 
    col.scale = c("blue", "brown"), grad.dir = "ends", col.text.rotation = 0)

state.pair.count <- sapply(all.pair.co.occur, length)
pair.count.order <- order(state.pair.count)
if(is.interactive){quartz()}
barplot(state.pair.count[pair.count.order], las = 2, horiz = TRUE, cex.names = 1)

Chromatin State at TSS and Gene Expression

We looked at the effect of the chromatin state right at the TSS and gene expression. This is a littl different than the other effects of gene expression we’ve looked at. Here we first identified the genes for which different strains had either of two states at the TSS. We then divided the expression for the strains with the first state at the TSS, and the strains with the second state at the TSS. The boxplots show overall differences in expression for the strains with each given state at the TSS.

The figure below shows the expression of all genes across all strains based on the state present at the TSS. Genes with state 7 at the TSS have the highest expression overall.

matched.expr <- expr[match(rownames(tss.states), names(expr))]
has.expr <- which(sapply(matched.expr, length) == 9)
sub.expr <- Reduce("rbind", matched.expr[has.expr])
rownames(sub.expr) <- names(matched.expr)[has.expr]
sub.tss <- tss.states[match(rownames(sub.expr), rownames(tss.states)),]
expr.vector <- unlist(sub.expr)
tss.vector <- unlist(sub.tss)
expr.by.tss <- lapply(1:9, function(x) expr.vector[which(tss.vector == x)])
boxplot(expr.by.tss)

for(i in 1:length(paired.tss.expr)){
    if(is.interactive){quartz()}
    cat("###", names(paired.tss.expr[i]), "\n")
    box.col <- rep("gray", num.states)
    box.col[state_pairs[i,]] <- "red"
    plot(as.vector(paired.tss.expr[[i]][[2]])~as.factor(as.vector(paired.tss.expr[[i]][[1]])), 
         main = paste("State", unique(state_pairs[i,])), xlab = "State", ylab = "Expression",
        col = box.col)
    cat("\n\n")
}

1_1

1_2

2_2

1_3

2_3

3_3

1_4

2_4

3_4

4_4

1_5

2_5

3_5

4_5

5_5

1_6

2_6

3_6

4_6

5_6

6_6

1_7

2_7

3_7

4_7

5_7

6_7

7_7

1_8

2_8

3_8

4_8

5_8

6_8

7_8

8_8

1_9

2_9

3_9

4_9

5_9

6_9

7_9

8_9

9_9

cat("### All Single States\n")

All Single States

single.locale <- which(apply(state_pairs, 1, function(x) length(unique(x))) == 1)
boxplot(lapply(paired.tss.expr[single.locale], function(x) x[[2]]))
abline(h = median(unlist(paired.tss.expr[single.locale])))

cat("\n\n")

Chromatin State Enrichment

We looked at functional enrichments for genes with each pair of states present at the TSS.

Some of these genes have more than two states at the TSS, so the groups are overlapping somewhat, but using more categories was a bit too disorganized.

Creyghton et al. 2010 found that although H3K4me1 marks cell type-specific enhancers, it marks all enhancers whether they are poised or active. They showed that the presence of H3K27ac distinguishes active enhancers (H3K27ac present) from poised (H3K27ac absent). They suggested that the pattern of these enhancer states could give us clues as to the developmental potential of a cell, or its potential to respond to stimuli.

Can we use this information to look at enhancer patterns across the strains to see which strains could potentially respond to, or are responding to a stimulus, with inflammation, for example?

Let’s think about a case study gene: Irf5. This gene is associated with cytokine response. CAST and PWK have marks of a poised enhancer at the TSS, while the other strains have marks associated with active enhancers. Does this mean that Irf5 has the potential to be expressed in CAST and PWK with the right stimulus, but currently is not, whereas the other strains are all expressing that gene without any stimulus?

What are the enrichments of genes for which some strains have poised enhancers (3, 4, 8), and others have active enhancers (5,6,7) at the TSS?

enrich.file <- here("Results", "chQTL", "State_Pair_Enrichment.RDS")

if(!file.exists(enrich.file)){
    state_enrich <- vector(mode = "list", length = length(all.pair.co.occur))
    names(state_enrich) <- names(all.pair.co.occur)
    for(i in 1:length(all.pair.co.occur)){
        enrichment <- gost(all.pair.co.occur[[i]], organism = "mmusculus", sources = "GO")
        if(!is.null(enrichment)){
            state_enrich[[i]] <- enrichment
        }
    }
    saveRDS(state_enrich, enrich.file)
}else{
    state_enrich <- readRDS(enrich.file)
}
for(i in 1:length(state_enrich)){
    if(length(state_enrich[[i]]) > 0){
        if(is.interactive){quartz()}
        cat("###", names(state_enrich[i]), "\n")
        #plot.enrichment(state_enrich[[i]], 30, plot.label = paste(names(state_enrich)[i], length(all.pair.co.occur[[i]]), "genes"))
        par(mfrow = c(1,2))
        plot.enrichment.wordcloud(enrichment = state_enrich[[i]], 10,
         max.term.size = 2000)
        mtext(paste0("State ", names(all.pair.co.occur)[i], ": ", 
        length(all.pair.co.occur[[i]]), " genes"), side = 3, outer = TRUE,
        line = -2.5)
        cat("\n\n")
    }
}

1_1

1_2

2_2

1_3

2_3

3_3

1_4

2_4

3_4

4_4

1_5

2_5

4_5

5_5

1_6

2_6

3_6

5_6

6_6

1_7

3_7

4_7

5_7

6_7

7_7

2_8

3_8

4_8

6_8

7_8

8_8

1_9

2_9

3_9

6_9

7_9

8_9

state_gene_names <- lapply(all.pair.co.occur, 
    function(x) transcript.info[match(x, 
    transcript.info[,"ensembl_gene_id"]),"external_gene_name"])

Group Enrichment

#pdf("~/Desktop/group_enrich.pdf", width = 9, height = 12)
test <- plot.enrichment.group(state_enrich, transformation = "sqrt")

#dev.off()

pdf(file.path(results.dir, "TSS_Enrichment_Projections.pdf"), height = 15, width = 15)
plot_bipartite(test)
dev.off()
## quartz_off_screen 
##                 2
state_gene_names$"2_5"


states <- "3_7"
states <- "7_7"

if(is.interactive){quartz()}
pair.locale <- which(names(state_enrich) == states)
plot.enrichment(state_enrich[[pair.locale]], 30, 
    plot.label = paste(names(state_enrich)[pair.locale], 
    length(all.pair.co.occur[[pair.locale]]), "genes"),
    max.term.size = 2000)

par(mfrow = c(1,2))
plot.enrichment.wordcloud(enrichment = state_enrich[[i]], 10)

gene.name = "Rdh16f1"
id <- transcript.info[which(transcript.info[,"external_gene_name"] == gene.name)[1],"ensembl_gene_id"]
id.locale <- which(names(chrom.mats) == id)
get_tss_states(chrom.mats[[id.locale]])

Use the shiny chromatin viewer to see examples.

Intragenic states

Most of the states (1, 2, 3, 7, 8, and 9) have their strongest effects at the TSS, which is easy to isolate. However, states, 4, 5, and 6 have their primary effects in the intragenic region. It would be interesting to look at genes that specifically have these states intragenically, but I’m not sure how to do that. I am going to test some code here.

get_intragenic_state_count <- function(chrom.mat, state){
    if(length(chrom.mat) == 1){return(rep(NA, 9))}
    intragenic.idx <- intersect(which(as.numeric(rownames(chrom.mat)) > 0), which(as.numeric(rownames(chrom.mat)) < 1))
    intra.chrom <- chrom.mat[intragenic.idx,,drop=FALSE]
    state.idx <- lapply(1:ncol(intra.chrom), function(x) which(intra.chrom[,x] == state))
    state.count <- sapply(state.idx, length)/nrow(intra.chrom)
    return(state.count)
}

min.prop <- 0.1

for(state in 4:6){
    cat(paste("### State", state, "Intragenic Enrichment\n"))
    intragenic.prop <- t(sapply(chrom.mats, function(x) get_intragenic_state_count(x, state)))
    colnames(intragenic.prop) <- colnames(chrom.mats[[1]])

    no.state <- lapply(1:ncol(intragenic.prop), function(x) which(intragenic.prop[,x] == 0))
    names(no.state) <- colnames(intragenic.prop)
    some.state <- lapply(1:ncol(intragenic.prop), function(x) which(intragenic.prop[,x] > min.prop))
    names(some.state) <- colnames(intragenic.prop)

    state.no.enrich.file <- file.path(results.dir, paste0("State_", state, "_intragenic_enrichment_none.RDS"))
    if(!file.exists(state.no.enrich.file)){
        no.enrich <- lapply(no.state, 
            function(x) gost(rownames(intragenic.prop)[x], organism = "mmusculus", 
            sources = c("GO", "KEGG", "REACTOME")))
        saveRDS(no.enrich, state.no.enrich.file)
    }else{
    no.enrich <- readRDS(state.no.enrich.file)
    }

    state.some.enrich.file <- file.path(results.dir, paste0("State_", state, "_intragenic_enrichment_some.RDS"))
    if(!file.exists(state.some.enrich.file)){
        some.enrich <- lapply(some.state, 
            function(x) gost(rownames(intragenic.prop)[x], organism = "mmusculus", 
            sources = c("GO", "KEGG", "REACTOME")))
        saveRDS(some.enrich, state.some.enrich.file)
    }else{
    some.enrich <- readRDS(state.some.enrich.file)
    }

    if(is.interactive){quartz()}
    plot.enrichment.group(some.enrich, max.term.size = 2000, n.terms = 30, 
    plot.label = paste("Enrichment of genes with some intragenic state", state))

    if(is.interactive){quartz()}
    plot.enrichment.group(no.enrich, max.term.size = 2000, n.terms = 30,
    plot.label = paste("Enrichment of genes with no intragenic state", state))

    some.strains <- which(apply(intragenic.prop, 1, function(x) length(which(x == 0)) > 0 && length(which(x > 0)) > 0))
    enrich <- gost(rownames(intragenic.prop)[some.strains], organism = "mmusculus",
    sources = c("GO", "KEGG", "REACTOME"))
    if(is.interactive){quartz()}
    par(mfrow = c(1,2))
    plot.enrichment.wordcloud(enrich, num.terms = 30)
    mtext(paste("Enrichment of genes with state", state, "in some strains and not others"),
    side = 3, outer = TRUE, line = -2.5)
    cat("\n\n")
}